Author

Peter Youyun Zheng

Published

July 22, 2024

Introduction

In this doc, we will be looking at the banksy clusters and try to interpret them based on their identity, and other characteristics.

Load the data

The objects are here:

  1. RDS object with Leiden Clusters: /xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_20240711_135041.rds

    • /xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_embeddings/total_spatial_banksy_clusters_subset_20240711_135041.rds
  2. RDS object with cell type cluster markers: /xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_markers_20240711_135041.rds

Metadata is here:

  1. /Users/youyun/Documents/HMS/PhD/beroukhimlab/broad_mount/coja/Spatial_PLGG/data/metadata/Xenium_PS.xlsx
Code
# banksy_embeddings = readRDS(paste0(
#     workdir,'coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_subset_20240711_135041.rds'
# ))
# colData(banksy_embeddings)$sample_id = factor(colData(banksy_embeddings)$sample_id)

# # generate some leiden clusters at varying resolution
# lambda = c(0.2, 0.8)
# SEED = 55555
# banksy_embeddings <- Banksy::clusterBanksy(
#     banksy_embeddings, dimred = "Harmony_BANKSY_lam0.2", resolution = c(1,0.5), 
#     algo = 'leiden',seed = SEED
# )
# banksy_embeddings <- Banksy::clusterBanksy(
#     banksy_embeddings, dimred = "Harmony_BANKSY_lam0.8", resolution = c(1,0.5), 
#     algo = 'leiden',seed = SEED
# )

# # getting rid of the previously generated clusters because their resolution is too high
# colData(banksy_embeddings)[,c(
#     'clust_Harmony_BANKSY_lam0.2_k50_res1.2','clust_Harmony_BANKSY_lam0.8_k50_res1.2'
# )] = NULL

# # Different clustering runs can be relabeled to minimise their differences with connectClusters:
# banksy_embeddings <- Banksy::connectClusters(banksy_embeddings, map_to = clusterNames(banksy_embeddings)[1])

# # save the clusters
# saveRDS(banksy_embeddings, paste0(
#     workdir,'coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_subset_20240711_135041_reclustered.rds'
# ))

banksy_embeddings = readRDS(paste0(
    workdir,'coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_subset_20240711_135041.rds'
))
colData(banksy_embeddings)$sample_id = factor(colData(banksy_embeddings)$sample_id)

cell_type_markers = fread(paste0(
    workdir,'coja/Spatial_PLGG/data/markers/final_manual_markers.csv'
))
xenium_markers = fread(paste0(
    workdir, 'youyun/plgg/data/xenium_selection/Xenium_hBrain_v1_metadata.csv'
))

Looking at the harmony corrected embeddings

We will first look at the extent of batch effect in the cohort by looking at the embeddings before and after harmony correction.

Code
plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP Before/After Harmony (Cell Identity)", fontface='bold'),
    plot_grid(
        plotReducedDim(
            banksy_embeddings, "UMAP_M1_lam0.2", point_size = 0.1,
            point_alpha = 0.5, color_by = "sample_id"
        ) +
            theme(legend.position = "none"),
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.2", 
            point_size = 0.1,point_alpha = 0.5, color_by = "sample_id"
        ) +
            theme(legend.title = element_blank()) +
            guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))),
        nrow = 1,
        rel_widths = c(1, 1.2)
    ), ncol = 1,rel_heights = c(0.1, 5)
)

Code
plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP Before/After Harmony (Neighborhood)", fontface='bold'),
    plot_grid(
        plotReducedDim(
            banksy_embeddings, "UMAP_M1_lam0.8", point_size = 0.1,
            point_alpha = 0.5, color_by = "sample_id"
        ) +
            theme(legend.position = "none"),
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.8", 
            point_size = 0.1,point_alpha = 0.5, color_by = "sample_id"
        ) +
            theme(legend.title = element_blank()) +
            guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))),
        nrow = 1,
        rel_widths = c(1, 1.2)
    ), ncol = 1,rel_heights = c(0.1, 5)
)

Generate the clusters at different resolution and look at them

It seems like resolution = 1 is still too coarse for leiden clusteringsp we are going back to 1.2

Code
plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP for Cell Identity(L) and Niche(R)", fontface='bold'),
    plot_grid(
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.2", 
            point_size = 0.1, point_alpha = 0.5, 
            color_by = "clust_Harmony_BANKSY_lam0.2_k50_res1.2"
        ) +
            theme(legend.title = element_blank()) +
            guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))),
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.8", 
            point_size = 0.1, point_alpha = 0.5, 
            color_by = "clust_Harmony_BANKSY_lam0.8_k50_res1.2"
        ) +
            theme(legend.title = element_blank()) +
            guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1))),
        nrow = 1,
        rel_widths = c(1, 1)
    ), ncol = 1,rel_heights = c(0.1, 5)
)

Just to get an idea of where each cluster is, we can look at the UMAPs colored individually by the clusters.

Code
num_cell_cluster = max(as.numeric(colData(banksy_embeddings)$clust_Harmony_BANKSY_lam0.2_k50_res1.2))
# one hot encoded columns
one_hot_encoded_clusters = data.frame(dcast(data.table(
    cell_sample_id = rownames(colData(banksy_embeddings)),
    clusters = paste0('cell_in_',colData(banksy_embeddings)$clust_Harmony_BANKSY_lam0.2_k50_res1.2)
), cell_sample_id ~ clusters, length)[order(match(cell_sample_id,rownames(colData(banksy_embeddings))))])
# cbind into the colData
colData(banksy_embeddings) = cbind(
    colData(banksy_embeddings),
    one_hot_encoded_clusters
)
all(rownames(colData(banksy_embeddings)) == colData(banksy_embeddings)$cell_sample_id)

# make the plots
a = lapply(1:num_cell_cluster, function(x){
    assign(
        paste0('cell_type_plot_',x),
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.2", 
            point_size = 0.1, point_alpha = 0.5, 
            color_by = paste0('cell_in_',x)
        )  + theme(
            legend.position = "none",
            axis.text=element_blank(),
            axis.ticks=element_blank(),
            axis.title=element_blank()
        ) + labs(title = paste0('Cluster ',x)),
        envir = .GlobalEnv
    )
})

plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP by Cluster", fontface='bold'),
    plot_grid(
        plotlist = mget(paste0('cell_type_plot_',1:num_cell_cluster)),
        nrow = 4,
        rel_widths = rep(1,num_cell_cluster)
    ), ncol = 1,rel_heights = c(0.1, 5)
)

[1] TRUE

What are the cell types in the clusters?

Manually- and Xenium- annotated cell type markers

Code
kable(rbind(cell_type_markers[
    category == 'Cell Type' & xenium == TRUE,
    .(markers = paste0(marker, collapse = ', '), source = 'manual'),.(annotation)
],xenium_markers[
    ,.(markers = paste0(Genes, collapse = ', '), source = 'xenium'),.(annotation = Annotation)
]))

markers_to_score = rbind(cell_type_markers[
    category == 'Cell Type' & xenium == TRUE,
    .(markers = marker),.(annotation = paste0(annotation, '_manual'))
],xenium_markers[
    ,.(markers = Genes),.(annotation = paste0(Annotation, '_xenium'))
])[
    ,annotation := gsub(' ','_',annotation)
]

# Scoring the cells by cell type markers
cell_types = unique(markers_to_score$annotation)
marker_score = do.call('cbind',lapply(cell_types, function(x){
    genes_of_interest = markers_to_score[annotation == x]$markers
    cell_scores = log10(colSums(
        assay(banksy_embeddings[genes_of_interest,], 'normcounts')
    )/length(genes_of_interest) + 1)
    cell_scores_df = data.frame(cell_scores)
    rownames(cell_scores_df) = names(cell_scores)
    colnames(cell_scores_df) = x
    cell_scores_df
}))
colData(banksy_embeddings)[,cell_types] = NULL
if(all(rownames(colData(banksy_embeddings)) == rownames(colData(banksy_embeddings)))){
    colData(banksy_embeddings) = cbind(colData(banksy_embeddings),marker_score)
}else{
    stop('Cell IDs do not match')
}

# make the plots
a = lapply(cell_types, function(x){
    assign(
        paste0('plot_',x),
        plotReducedDim(
            banksy_embeddings, "UMAP_Harmony_BANKSY_lam0.2", 
            point_size = 0.1, point_alpha = 0.5, 
            color_by = x
        ) +
            theme(legend.title = element_blank(), legend.position = 'bottom') +
            guides(colour = guide_legend(override.aes = list(size = 5, alpha = 1), nrow = 1)) +
            labs(title = x),
        envir = .GlobalEnv
    )
})
annotation markers source
Stromal S100A4, DCN manual
OPC OLIG2, PDGFRA, OLIG1, SOX10 manual
Progenitor Glia TOP2A manual
Oligodendrocyte MAG, MOG, OLIG1, MYRF manual
Endothelial PECAM1 manual
Microglia AIF1, PTPRC manual
Astrocyte AQP4, APOE manual
T_cell PTPRC, CD52 manual
RGC PAX6, HES1, CDH4, PTPRZ1 manual
VLMC ABCC9, ADAMTS12, ANO3, CDH6, CEMIP, COL12A1, CSPG4, DCN, FBLN1, IGFBP4, LAMA2, MCTP2, NR2F2, PHLDB2, PLCE1, RFTN1, SERPINA3, TMEM132C xenium
L4 IT ADAMTS16, FSTL4, OTOGL, POU6F2, RORB, SLC17A6 xenium
L6 IT ADAMTS3, C1QL3, SNTB2 xenium
Sncg ADRA1A, ADRA1B, LYPD6B, PROX1, THSD7B xenium
Glioblastoma (TME) AIF1, C1orf162, CAPG, CCL5, CD14, CD163, CD2, CD3G, CD4, CD48, CD52, CD68, CD83, CORO1A, CTSS, CX3CR1, CXCR4, CYTIP, FCER1G, FCGR1A, FCGR3A, GNLY, GPNMB, GPR183, GPR34, GZMA, HLA-DMB, IDO1, IL7R, ITGB2, KLF2, KLF4, KLRB1, LY86, MS4A6A, NKG7, NR4A2, RGS10, RNASET2, S100A4, STXBP2, TGFBI, TMIGD3, TRAC xenium
Chandelier ALK, ANK1, CNTNAP3B, LHX6, NPNT, PVALB, SDK1, TENM1, UNC5B xenium
Pax6 ANGPT1, CCK, CDH4, CXCL14, DDR2, PLD5, RELN, SFRP2, WIF1 xenium
Sst Chodl ANKRD18A, CRHBP, NDST4, NXPH2, SST, TAC1, TACR1, TRPC6 xenium
Glioblastoma (Cancer cells) ANXA1, B4GALNT1, BCAN, CAV1, CHODL, EGFR, ELOVL2, HES1, HILPDA, IDH1, IDH2, IGFBP3, IGFBP5, LOX, MGST1, NNAT, PSEN2, PSENEN, SOX11, SOX2, SOX4, TP53, TRIL, TTYH1 xenium
Microglia-PVM APOE, ARHGAP24, CCL4, CD86, CTSH, FASLG, HLA-DQA1, IFITM3, ITGAM, ITGAX, LRRK1, LYVE1, P2RY12, P2RY13, PTPRC, RGS16, SPI1, TGFB1, TREM2 xenium
Broad APP xenium
Astrocyte AQP4, FGFR3, GJA1, MEIS2, NES, PAX6, RYR3, SOX9, SPON1, TGFB2, THBS1 xenium
L5/6 NP ATP2C2, CD36, CRYM, HTR2C, TPH2, TSHZ2, VWC2L xenium
OPC BRINP3, CALCRL, ITGA8, LRRK2, OLIG1, OLIG2, PDGFRA, PTPRZ1, SEMA5A, SOX10, STK32B, VCAN xenium
Pvalb BTBD11, GAD1, MEPE, MYO5B, RSPO2, SAMD5, SULF1 xenium
Oligodendrocyte CAPN3, CDH1, CLDN11, CNDP1, CNTN2, CTNNA3, EFHD1, ERBB3, ERMN, FGFR2, HHATL, KLK6, MAG, MAL, MOBP, MOG, MYRF, NCSTN, OPALIN, PCSK6, PSEN1, ST18, UGT8 xenium
Proliferation CCNA1, CCNB2, CDK1, CENPF, KIT, MKI67, PCNA, TOP2A xenium
L6 IT Car3 CDH12, GAS2L3, NPY1R, NTNG2, NWD2, POSTN, RASGRP1, SLC26A4, THEMIS, TRPC5, ZDHHC23 xenium
Endothelial CEMIP2, FLT1, NOTCH1, NRP1, PECAM1, RNF144B, STAT3, THSD4 xenium
Sst COL25A1, PLCH1, SYNPR, TRHDE xenium
L2/3 IT CUX2, TESPA1, ZBBX xenium
Lamp5 DNER, PTCHD4 xenium
Lamp5 Lhx6 EYA4, GAD2, LAMP5, LYPD6, MYO16, NTNG1, PDGFD, SPHKAP xenium
L6 CT FILIP1, HS3ST2, HS3ST4, KCNH5 xenium
L5 IT HTR2A, RXFP1 xenium
L6b NPFFR2, ROS1, SLC17A7 xenium
L5 ET PCSK1, RIT2, SLIT3, SNCG, SORCS1 xenium
Vip SLC24A3, VIP xenium

We are going to annotate 37 cell types. and we will now visualize the marker scores.

Code
plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP by Manually Curated Markers", fontface='bold'),
    plot_grid(
        plotlist = mget(paste0('plot_',grep('_manual',cell_types, value = TRUE))),
        nrow = 3,
        rel_widths = rep(1,sum(grepl('_manual',cell_types)))
    ), ncol = 1,rel_heights = c(0.1, 5)
)

Code
plot_grid(
    ggdraw() + draw_label("BANKSY Embedding UMAP by Xenium Markers", fontface='bold'),
    plot_grid(
        plotlist = mget(paste0('plot_',grep('_xenium',cell_types, value = TRUE))),
        nrow = 5,
        rel_widths = rep(1,sum(grepl('_xenium',cell_types)))
    ), ncol = 1,rel_heights = c(0.1, 5)
)

We should complement the manual approach with more statistics

Finding Statistical Markers

A better way to do this is to test statistically, which clusters are enriched for which signature. We will do that here

Code
cell_type_cluster_sig = do.call('rbind',lapply(paste0('cell_in_',1:num_cell_cluster), function(x){
    scran_cell_markers = findMarkers(
        as.matrix(t(colData(banksy_embeddings)[,cell_types])),
        groups = colData(banksy_embeddings)[,x],
        test.type="wilcox", direction="up"
    )    
    delta_dt = melt(setDT(
        as.data.frame(colData(banksy_embeddings)[,c(cell_types,x)]),
        keep.rownames = 'cell_sample_id'
    ), id.vars = c('cell_sample_id',x), variable.name = 'cell_type', value.name = 'score')[
        ,.(delta_score = mean(score[get(x) == 1]) - mean(score[get(x) == 0])),.(cell_type)
    ]
    cluster_cell_type = setDT(
        data.frame(scran_cell_markers[[2]]),
        keep.rownames = 'cell_type'
    )[,.(cell_type = make.names(cell_type), Top, p.value, FDR, cluster = x)]
    merge(
        cluster_cell_type, delta_dt, by = 'cell_type'
    )
}))

# make a hierarchically clustered heatmap
cell_type_cluster_sig_matrix = dcast(
    cell_type_cluster_sig[,.(cell_type, cluster, FDR = ifelse(FDR<0.05,'*',''))],
    cluster ~ cell_type, value.var = 'FDR'
)
cell_type_cluster_delta_matrix = dcast(
    cell_type_cluster_sig[,.(cell_type, cluster, delta_score)],
    cluster ~ cell_type, value.var = 'delta_score'
)
plot_sig_mt = as.matrix(cell_type_cluster_sig_matrix[,c(2:ncol(cell_type_cluster_sig_matrix)), with = FALSE])
plot_delta_mt = as.matrix(cell_type_cluster_delta_matrix[,c(2:ncol(cell_type_cluster_delta_matrix)), with = FALSE])
rownames(plot_sig_mt) = cell_type_cluster_sig_matrix$cluster
rownames(plot_delta_mt) = cell_type_cluster_delta_matrix$cluster
col_fun = colorRamp2(c(-0.3,0,0.3), c("green", "white", "red"))
Heatmap(
    plot_delta_mt, name = "Delta Mean Norm Counts", col = col_fun,
    cell_fun = function(j, i, x, y, width, height, fill) {
        grid.text(sprintf("%s", plot_sig_mt[i, j]), x, y, gp = gpar(fontsize = 10))
})

Another way is to do DE of one vs all and find marker genes

Code
library(Seurat)
banksy_embeddings_seurat = as.Seurat(banksy_embeddings, counts = 'counts', data = NULL)
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from PC to PC_
Keys should be one or more alphanumeric characters followed by an underscore, setting key from PC to PC_
Warning: Key 'PC_' taken, using 'pcam1lam02_' instead
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from PC to PC_
Warning: Key 'PC_' taken, using 'pcam1lam08_' instead
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from PC to PC_
Warning: Key 'PC_' taken, using 'harmonybanksylam02_' instead
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from PC to PC_
Warning: Key 'PC_' taken, using 'harmonybanksylam08_' instead
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from UMAP_M1_lam0_
to UMAPM1lam0_
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from UMAP_M1_lam0.2_
to UMAPM1lam02_
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from UMAP_M1_lam0.8_
to UMAPM1lam08_
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from
UMAP_Harmony_BANKSY_lam0.2_ to UMAPHarmonyBANKSYlam02_
Warning: Keys should be one or more alphanumeric characters followed by an underscore, setting key from
UMAP_Harmony_BANKSY_lam0.8_ to UMAPHarmonyBANKSYlam08_
Code
DE_genes = FindAllMarkers(
    banksy_embeddings_seurat,
    slot = 'counts',
    test.use = 'DESeq2',
    only.pos = TRUE
)
Calculating cluster SingleCellExperiment
Warning: No DE genes identified
Warning: The following tests were not performed:
Warning: When testing SingleCellExperiment versus all:
    Cells in one or both identity groups are not present in the data requested

With the information above, we can begin to piece together some preliminary definition for each cluster.

Code
prelim_annotation = data.table(
    cluster = 1:num_cell_cluster,
    annotation = c(

    )
)

Looking at the cell types + niches in space

Code
graph_dt = melt(
    as.data.table(cbind(
        spatialCoords(banksy_embeddings),
        colData(banksy_embeddings)[c(
            'sample_id','cell_id',
            grep('Harmony.*.res1.2$',clusterNames(banksy_embeddings), value = TRUE)
        )]
    )),id.vars = c('sample_id','cell_id','sdimx','sdimy'),
    variable.name = 'Cluster Type',value.name = 'Cluster ID'
)[,sample_id := gsub('output-XETG00078__','',sample_id)]
graph_dt

cluster_classes = sort(unique(graph_dt$`Cluster ID`))
color_manual = structure(
    pals::polychrome()[c(1:length(cluster_classes))],
    names = cluster_classes
)

graph_dt[,.(min_x = min(sdimx)), by = c('sample_id')][order(min_x)]

ggplot(graph_dt, aes(x=sdimy, y=sdimx, color=`Cluster ID`)) +
    facet_wrap(`Cluster Type`~sample_id, nrow = 2, scale = 'free_x') +
    geom_point(size = 0.1, alpha = 0.5) + 
    scale_color_manual(values = color_manual) +
    theme_classic() + 
    theme(
        legend.position = "bottom",
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank()
    ) + guides(
        color = guide_legend(nrow = 3, byrow = TRUE)
    ) +
    coord_flip()

    sample_id     min_x
       <char>     <num>
 1:     42112  14616.65
 2:     51420  28491.42
 3:     53398  42717.19
 4:     67068  57325.54
 5:     87352  71228.25
 6:    188570  85592.35
 7:    240468  99684.05
 8:    241670 113911.71
 9:    258482 128370.33
10:    320784 142408.38
11:    328368 156628.75